import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sb
import plotly.express as px
from ydata_profiling import ProfileReport
import math
from scipy import stats
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error,r2_score,mean_squared_error
from yellowbrick.classifier import ClassificationReport, ClassPredictionError
from statsmodels.tools.tools import pinv_extended
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm
# Sanity check that the kernel executes cells.
greeting = "Hello World!!"
print(greeting)
Hello World!!
# Load the 2017 NYC Yellow Taxi trip sample.
# The original used an f-string with no placeholders; a plain literal is clearer
# (split across lines only for readability — the path is unchanged).
data = pd.read_csv(
    'C:/Users/Jason/Desktop/Data analysis/TLC Analysis/'
    'gvJoe37aS_6HZqnLUmiajw_592fe280dd804403b6e33fd9d9ffa9f1_2017_Yellow_Taxi_Trip_Data.csv'
)
# Peek at the first rows to confirm the load.
data.head()
| Unnamed: 0 | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24870114 | 2 | 03/25/2017 8:55:43 AM | 03/25/2017 9:09:47 AM | 6 | 3.34 | 1 | N | 100 | 231 | 1 | 13.00 | 0.00 | 0.50 | 2.76 | 0.00 | 0.30 | 16.56 |
| 1 | 35634249 | 1 | 04/11/2017 2:53:28 PM | 04/11/2017 3:19:58 PM | 1 | 1.80 | 1 | N | 186 | 43 | 1 | 16.00 | 0.00 | 0.50 | 4.00 | 0.00 | 0.30 | 20.80 |
| 2 | 106203690 | 1 | 12/15/2017 7:26:56 AM | 12/15/2017 7:34:08 AM | 1 | 1.00 | 1 | N | 262 | 236 | 1 | 6.50 | 0.00 | 0.50 | 1.45 | 0.00 | 0.30 | 8.75 |
| 3 | 38942136 | 2 | 05/07/2017 1:17:59 PM | 05/07/2017 1:48:14 PM | 1 | 3.70 | 1 | N | 188 | 97 | 1 | 20.50 | 0.00 | 0.50 | 6.39 | 0.00 | 0.30 | 27.69 |
| 4 | 30841670 | 2 | 04/15/2017 11:32:20 PM | 04/15/2017 11:49:03 PM | 1 | 4.37 | 1 | N | 4 | 112 | 2 | 16.50 | 0.50 | 0.50 | 0.00 | 0.00 | 0.30 | 17.80 |
# Column dtypes and non-null counts: 22,699 rows, no missing values (see output below).
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 22699 entries, 0 to 22698 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 22699 non-null int64 1 VendorID 22699 non-null int64 2 tpep_pickup_datetime 22699 non-null object 3 tpep_dropoff_datetime 22699 non-null object 4 passenger_count 22699 non-null int64 5 trip_distance 22699 non-null float64 6 RatecodeID 22699 non-null int64 7 store_and_fwd_flag 22699 non-null object 8 PULocationID 22699 non-null int64 9 DOLocationID 22699 non-null int64 10 payment_type 22699 non-null int64 11 fare_amount 22699 non-null float64 12 extra 22699 non-null float64 13 mta_tax 22699 non-null float64 14 tip_amount 22699 non-null float64 15 tolls_amount 22699 non-null float64 16 improvement_surcharge 22699 non-null float64 17 total_amount 22699 non-null float64 dtypes: float64(8), int64(7), object(3) memory usage: 3.1+ MB
# Summary statistics; note the negative minimums for fare/total and the extreme
# maxima (fare 999.99, total 1200.29) — likely outliers to investigate.
data.describe()
| Unnamed: 0 | VendorID | passenger_count | trip_distance | RatecodeID | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.269900e+04 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 |
| mean | 5.675849e+07 | 1.556236 | 1.642319 | 2.913313 | 1.043394 | 162.412353 | 161.527997 | 1.336887 | 13.026629 | 0.333275 | 0.497445 | 1.835781 | 0.312542 | 0.299551 | 16.310502 |
| std | 3.274493e+07 | 0.496838 | 1.285231 | 3.653171 | 0.708391 | 66.633373 | 70.139691 | 0.496211 | 13.243791 | 0.463097 | 0.039465 | 2.800626 | 1.399212 | 0.015673 | 16.097295 |
| min | 1.212700e+04 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | -120.000000 | -1.000000 | -0.500000 | 0.000000 | 0.000000 | -0.300000 | -120.300000 |
| 25% | 2.852056e+07 | 1.000000 | 1.000000 | 0.990000 | 1.000000 | 114.000000 | 112.000000 | 1.000000 | 6.500000 | 0.000000 | 0.500000 | 0.000000 | 0.000000 | 0.300000 | 8.750000 |
| 50% | 5.673150e+07 | 2.000000 | 1.000000 | 1.610000 | 1.000000 | 162.000000 | 162.000000 | 1.000000 | 9.500000 | 0.000000 | 0.500000 | 1.350000 | 0.000000 | 0.300000 | 11.800000 |
| 75% | 8.537452e+07 | 2.000000 | 2.000000 | 3.060000 | 1.000000 | 233.000000 | 233.000000 | 2.000000 | 14.500000 | 0.500000 | 0.500000 | 2.450000 | 0.000000 | 0.300000 | 17.800000 |
| max | 1.134863e+08 | 2.000000 | 6.000000 | 33.960000 | 99.000000 | 265.000000 | 265.000000 | 4.000000 | 999.990000 | 4.500000 | 0.500000 | 200.000000 | 19.100000 | 0.300000 | 1200.290000 |
# List the column names.
data.columns
Index(['Unnamed: 0', 'VendorID', 'tpep_pickup_datetime',
'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
'tolls_amount', 'improvement_surcharge', 'total_amount'],
dtype='object')
Top 25 observations by total amount paid
# Top 25 trips by total fare, shaded by column magnitude.
fare_cols = ["tpep_pickup_datetime", "trip_distance", "fare_amount", "total_amount"]
data[fare_cols].sort_values(by="total_amount", ascending=False).head(25).style.background_gradient()
| tpep_pickup_datetime | trip_distance | fare_amount | total_amount | |
|---|---|---|---|---|
| 8476 | 2017-02-06 05:50:10 | 2.600000 | 999.990000 | 1200.290000 |
| 20312 | 2017-12-19 09:40:46 | 0.000000 | 450.000000 | 450.300000 |
| 13861 | 2017-05-19 08:20:21 | 33.920000 | 200.010000 | 258.210000 |
| 12511 | 2017-12-17 18:24:24 | 0.000000 | 175.000000 | 233.740000 |
| 15474 | 2017-06-06 20:55:01 | 0.000000 | 200.000000 | 211.800000 |
| 6064 | 2017-06-13 12:30:22 | 32.720000 | 107.000000 | 179.060000 |
| 16379 | 2017-11-30 10:41:11 | 25.500000 | 140.000000 | 157.060000 |
| 3582 | 2017-01-01 23:53:01 | 7.300000 | 152.000000 | 152.300000 |
| 11269 | 2017-06-19 00:51:17 | 0.000000 | 120.000000 | 151.820000 |
| 9280 | 2017-06-18 23:33:25 | 33.960000 | 150.000000 | 150.300000 |
| 1928 | 2017-06-16 18:30:08 | 12.500000 | 120.000000 | 137.800000 |
| 10291 | 2017-09-11 11:41:04 | 31.950000 | 131.000000 | 131.800000 |
| 6708 | 2017-10-30 11:23:46 | 0.320000 | 100.000000 | 126.000000 |
| 11608 | 2017-12-19 17:00:56 | 23.000000 | 99.500000 | 123.300000 |
| 908 | 2017-03-27 13:01:38 | 26.120000 | 100.000000 | 121.560000 |
| 7281 | 2017-01-01 03:02:53 | 0.000000 | 100.000000 | 120.960000 |
| 18130 | 2017-10-26 14:45:01 | 30.500000 | 90.500000 | 119.310000 |
| 13621 | 2017-11-04 13:32:14 | 19.800000 | 105.000000 | 115.940000 |
| 13359 | 2017-01-12 07:19:36 | 0.000000 | 75.000000 | 111.950000 |
| 29 | 2017-11-06 20:30:50 | 30.830000 | 80.000000 | 111.380000 |
| 18888 | 2017-11-04 12:22:33 | 17.980000 | 73.500000 | 110.160000 |
| 11839 | 2017-09-19 16:33:48 | 2.700000 | 99.000000 | 110.000000 |
| 11863 | 2017-08-24 19:44:41 | 14.100000 | 80.000000 | 108.950000 |
| 5271 | 2017-12-07 13:48:52 | 17.960000 | 70.000000 | 107.280000 |
| 5536 | 2017-03-16 12:14:51 | 17.500000 | 69.500000 | 106.600000 |
# Same top-25 view, but across every numeric column.
numeric_view = data.select_dtypes(include="number")
numeric_view.sort_values(by="total_amount", ascending=False).head(25).style.background_gradient()
| Unnamed: 0 | VendorID | passenger_count | trip_distance | RatecodeID | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | duration_secs | duration_mins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8476 | 11157412 | 1 | 1 | 2.600000 | 5 | 226 | 226 | 1 | 999.990000 | 0.000000 | 0.000000 | 200.000000 | 0.000000 | 0.300000 | 1200.290000 | 58.000000 | 0.966667 |
| 20312 | 107558404 | 2 | 2 | 0.000000 | 5 | 265 | 265 | 2 | 450.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.300000 | 450.300000 | 9.000000 | 0.150000 |
| 13861 | 40523668 | 2 | 1 | 33.920000 | 5 | 229 | 265 | 1 | 200.010000 | 0.000000 | 0.500000 | 51.640000 | 5.760000 | 0.300000 | 258.210000 | 3609.000000 | 60.150000 |
| 12511 | 107108848 | 2 | 1 | 0.000000 | 5 | 265 | 265 | 1 | 175.000000 | 0.000000 | 0.000000 | 46.690000 | 11.750000 | 0.300000 | 233.740000 | 18.000000 | 0.300000 |
| 15474 | 55538852 | 2 | 1 | 0.000000 | 5 | 265 | 265 | 1 | 200.000000 | 0.000000 | 0.500000 | 11.000000 | 0.000000 | 0.300000 | 211.800000 | 5.000000 | 0.083333 |
| 6064 | 49894023 | 2 | 1 | 32.720000 | 3 | 138 | 1 | 1 | 107.000000 | 0.000000 | 0.000000 | 55.500000 | 16.260000 | 0.300000 | 179.060000 | 4049.000000 | 67.483333 |
| 16379 | 101198443 | 2 | 1 | 25.500000 | 5 | 132 | 265 | 2 | 140.000000 | 0.000000 | 0.500000 | 0.000000 | 16.260000 | 0.300000 | 157.060000 | 3034.000000 | 50.566667 |
| 3582 | 111653084 | 1 | 1 | 7.300000 | 5 | 1 | 1 | 1 | 152.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.300000 | 152.300000 | 41.000000 | 0.683333 |
| 11269 | 51920669 | 1 | 2 | 0.000000 | 5 | 265 | 265 | 1 | 120.000000 | 0.000000 | 0.000000 | 20.000000 | 11.520000 | 0.300000 | 151.820000 | 55.000000 | 0.916667 |
| 9280 | 51810714 | 2 | 2 | 33.960000 | 5 | 132 | 265 | 2 | 150.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.300000 | 150.300000 | 2353.000000 | 39.216667 |
| 1928 | 51087145 | 1 | 2 | 12.500000 | 5 | 211 | 265 | 1 | 120.000000 | 0.000000 | 0.000000 | 5.000000 | 12.500000 | 0.300000 | 137.800000 | 2922.000000 | 48.700000 |
| 10291 | 76319330 | 2 | 1 | 31.950000 | 4 | 138 | 265 | 2 | 131.000000 | 0.000000 | 0.500000 | 0.000000 | 0.000000 | 0.300000 | 131.800000 | 2274.000000 | 37.900000 |
| 6708 | 91660295 | 2 | 1 | 0.320000 | 5 | 264 | 83 | 1 | 100.000000 | 0.000000 | 0.500000 | 25.200000 | 0.000000 | 0.300000 | 126.000000 | 3.000000 | 0.050000 |
| 11608 | 107690629 | 2 | 2 | 23.000000 | 3 | 151 | 1 | 1 | 99.500000 | 1.000000 | 0.000000 | 10.000000 | 12.500000 | 0.300000 | 123.300000 | 6060.000000 | 101.000000 |
| 908 | 25075013 | 2 | 2 | 26.120000 | 4 | 138 | 265 | 1 | 100.000000 | 0.000000 | 0.500000 | 15.000000 | 5.760000 | 0.300000 | 121.560000 | 2226.000000 | 37.100000 |
| 7281 | 111091850 | 2 | 1 | 0.000000 | 5 | 265 | 265 | 1 | 100.000000 | 0.000000 | 0.500000 | 20.160000 | 0.000000 | 0.300000 | 120.960000 | 9.000000 | 0.150000 |
| 18130 | 90375786 | 1 | 1 | 30.500000 | 1 | 132 | 220 | 1 | 90.500000 | 0.000000 | 0.500000 | 19.850000 | 8.160000 | 0.300000 | 119.310000 | 5268.000000 | 87.800000 |
| 13621 | 93330154 | 1 | 2 | 19.800000 | 5 | 265 | 230 | 1 | 105.000000 | 0.000000 | 0.000000 | 8.000000 | 2.640000 | 0.300000 | 115.940000 | 2796.000000 | 46.600000 |
| 13359 | 3055315 | 1 | 1 | 0.000000 | 5 | 1 | 1 | 1 | 75.000000 | 0.000000 | 0.000000 | 18.650000 | 18.000000 | 0.300000 | 111.950000 | 20.000000 | 0.333333 |
| 29 | 94052446 | 2 | 1 | 30.830000 | 1 | 132 | 23 | 1 | 80.000000 | 0.500000 | 0.500000 | 18.560000 | 11.520000 | 0.300000 | 111.380000 | 12550.000000 | 209.166667 |
| 18888 | 93297612 | 2 | 6 | 17.980000 | 3 | 230 | 1 | 1 | 73.500000 | 0.000000 | 0.000000 | 18.360000 | 18.000000 | 0.300000 | 110.160000 | 2746.000000 | 45.766667 |
| 11839 | 78875919 | 1 | 4 | 2.700000 | 5 | 231 | 265 | 1 | 99.000000 | 0.000000 | 0.000000 | 10.700000 | 0.000000 | 0.300000 | 110.000000 | 1952.000000 | 32.533333 |
| 11863 | 71564944 | 1 | 1 | 14.100000 | 5 | 48 | 265 | 1 | 80.000000 | 0.000000 | 0.000000 | 18.150000 | 10.500000 | 0.300000 | 108.950000 | 2145.000000 | 35.750000 |
| 5271 | 103571464 | 2 | 1 | 17.960000 | 3 | 164 | 1 | 1 | 70.000000 | 0.000000 | 0.000000 | 17.880000 | 19.100000 | 0.300000 | 107.280000 | 2415.000000 | 40.250000 |
| 5536 | 21688416 | 1 | 1 | 17.500000 | 3 | 164 | 1 | 1 | 69.500000 | 0.000000 | 0.000000 | 21.300000 | 15.500000 | 0.300000 | 106.600000 | 2265.000000 | 37.750000 |
%matplotlib inline
Client request: create meaningful variables by combining or modifying the given columns, and provide a summary of the data visualizations.
Create target variable - Duration
# --- Target variable: trip duration, plus calendar features ---
# Parse the pickup/dropoff timestamps in place.
data.tpep_dropoff_datetime = pd.to_datetime(data.tpep_dropoff_datetime)
data.tpep_pickup_datetime = pd.to_datetime(data.tpep_pickup_datetime)

# Compute the timedelta once; the original recomputed the subtraction for each unit.
trip_delta = data.tpep_dropoff_datetime - data.tpep_pickup_datetime
data["duration_secs"] = trip_delta / pd.Timedelta(seconds=1)
data["duration_mins"] = trip_delta / pd.Timedelta(minutes=1)

# Calendar features from the pickup timestamp ("Mon", "Jan", ... abbreviations).
data["week"] = data.tpep_pickup_datetime.dt.isocalendar().week
data["day"] = data.tpep_pickup_datetime.dt.day_name().str.slice(stop=3)
data["month"] = data.tpep_pickup_datetime.dt.month_name().str.slice(stop=3)
data["tpep_pickup_time"] = data["tpep_pickup_datetime"].dt.time

# Ordered categoricals so plots and sorts follow calendar order, plus numeric codes.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
data.month = pd.Categorical(data.month, categories=months, ordered=True)
data["month_num"] = data.month.cat.codes
data.day = pd.Categorical(data.day, categories=days, ordered=True)
data["day_num"] = data.day.cat.codes
data.sample(3)
| Unnamed: 0 | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | ... | improvement_surcharge | total_amount | duration_secs | duration_mins | week | day | month | tpep_pickup_time | month_num | day_num | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 15046 | 55154879 | 1 | 2017-06-30 23:48:43 | 2017-06-30 23:58:49 | 2 | 1.80 | 1 | N | 164 | 230 | ... | 0.3 | 12.35 | 606.0 | 10.100000 | 26 | Fri | Jun | 23:48:43 | 5 | 4 |
| 5433 | 85913300 | 2 | 2017-10-12 16:00:27 | 2017-10-12 16:17:57 | 1 | 1.94 | 1 | N | 186 | 125 | ... | 0.3 | 14.30 | 1050.0 | 17.500000 | 41 | Thu | Oct | 16:00:27 | 9 | 3 |
| 8726 | 86820791 | 2 | 2017-10-15 09:37:14 | 2017-10-15 09:38:42 | 5 | 0.57 | 2 | N | 164 | 230 | ... | 0.3 | 58.56 | 88.0 | 1.466667 | 41 | Sun | Oct | 09:37:14 | 9 | 6 |
3 rows × 26 columns
Divide 24 hour day into categories
# Bucket the 24-hour day into 4-hour periods starting at 01:00.
# .dt.hour is the idiomatic (and cheaper) way to extract the hour; the original
# round-tripped through strftime("%H") and astype(int).
data["hour"] = data["tpep_pickup_datetime"].dt.hour
conditions = [
    (data['hour'] >= 1) & (data['hour'] < 5),
    (data['hour'] >= 5) & (data['hour'] < 9),
    (data['hour'] >= 9) & (data['hour'] < 13),
    (data['hour'] >= 13) & (data['hour'] < 17),
    (data['hour'] >= 17) & (data['hour'] < 21),
    (data['hour'] >= 21),
    (data['hour'] < 1),  # hour 0 belongs to the 21:00-01:00 bucket
]
# Labels and time-range strings for each condition (the last two share a bucket,
# so every hour 0-23 is covered and np.select's default is never used).
values = ['Night owls', 'Morning people', 'My people', "Afternoon rush", "night travellers", 'Late night', "Late night"]
values_ = ["01:00 - 05:00", "05:00 - 09:00", "09:00 - 13:00", "13:00 - 17:00", "17:00 - 21:00", "21:00 - 01:00", "21:00 - 01:00"]
data['period_of_day'] = np.select(conditions, values)
data['time_of_day'] = np.select(conditions, values_)
data.time_of_day.value_counts()
17:00 - 21:00 5414 13:00 - 17:00 4650 09:00 - 13:00 4354 21:00 - 01:00 4237 05:00 - 09:00 2611 01:00 - 05:00 1433 Name: time_of_day, dtype: int64
# Alternative day-part scheme: morning/day/evening/night rush buckets.
rush_conditions = [
    (data['hour'] >= 6) & (data['hour'] < 10),    # 06:00-09:59
    (data['hour'] >= 10) & (data['hour'] < 16),   # 10:00-15:59
    (data['hour'] >= 16) & (data['hour'] < 20),   # 16:00-19:59
    (data['hour'] >= 20),                         # 20:00-23:59
    (data['hour'] < 6),                           # 00:00-05:59
]
rush_labels = ['Morning Rush', 'Day-lighers', 'Evening Rush', "Night owls", "Night owls"]
data['period_of_day2'] = np.select(rush_conditions, rush_labels)
data['period_of_day2'].value_counts()
Night owls 7200 Day-lighers 6760 Evening Rush 5251 Morning Rush 3488 Name: period_of_day2, dtype: int64
Payment types
# Human-readable payment categories (TLC data-dictionary codes 1-6).
payment_labels = {
    1: 'Credit card',
    2: 'Cash',
    3: 'No charge',
    4: 'Dispute',
    5: 'Unknown',
    6: 'Voided trip',
}
data['payment_cats'] = data.payment_type.replace(payment_labels)
data['payment_cats'].value_counts()
Credit card 15265 Cash 7267 No charge 121 Dispute 46 Name: payment_cats, dtype: int64
More time variables
# Fractional clock time (e.g. 11:30 -> 11.5) for continuous-time plots.
# .dt.minute avoids the original's strftime("%M") -> astype(int) round trip.
data['minutes'] = data["tpep_pickup_datetime"].dt.minute
data["time"] = data.hour + data.minutes / 60
data[["tpep_pickup_datetime", "hour", "minutes", "time"]].sample(5)
| tpep_pickup_datetime | hour | minutes | time | |
|---|---|---|---|---|
| 16948 | 2017-05-29 11:28:46 | 11 | 28 | 11.466667 |
| 5688 | 2017-05-15 11:44:53 | 11 | 44 | 11.733333 |
| 13631 | 2017-01-02 19:16:30 | 19 | 16 | 19.266667 |
| 20014 | 2017-12-23 12:24:59 | 12 | 24 | 12.400000 |
| 1824 | 2017-03-22 10:13:02 | 10 | 13 | 10.216667 |
Time-based data visualizations
# Aggregate totals over 4-hour pickup windows, ranked by total trip time.
# The trailing .copy(deep=True) in the original was redundant: reset_index()
# already returns a new DataFrame.
id_like_cols = ["Unnamed: 0", "VendorID", "RatecodeID", "PULocationID",
                "DOLocationID", "payment_type", "week"]
temp = (
    data.drop(id_like_cols, axis=1)
    .groupby(pd.Grouper(key="tpep_pickup_datetime", freq="4H"))
    .sum(numeric_only=True)
    .sort_values(by="duration_secs", ascending=False)
    .reset_index()
)
# Keep only the HH:MM:SS part of each window start for plotting.
temp.tpep_pickup_datetime = temp.tpep_pickup_datetime.astype('string').str.slice(start=11)
temp
| tpep_pickup_datetime | passenger_count | trip_distance | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | duration_secs | duration_mins | month_num | day_num | hour | minutes | time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20:00:00 | 24 | 38.09 | 175.0 | 8.0 | 8.0 | 25.36 | 0.00 | 4.8 | 221.16 | 183111.0 | 3051.850000 | 80 | 64 | 338 | 575 | 347.583333 |
| 1 | 08:00:00 | 18 | 41.33 | 174.0 | 0.0 | 6.0 | 24.29 | 5.76 | 3.6 | 213.65 | 178483.0 | 2974.716667 | 132 | 24 | 113 | 346 | 118.766667 |
| 2 | 16:00:00 | 26 | 90.00 | 348.5 | 0.0 | 7.5 | 50.74 | 5.76 | 4.5 | 417.00 | 109384.0 | 1823.066667 | 45 | 90 | 261 | 539 | 269.983333 |
| 3 | 20:00:00 | 38 | 94.81 | 343.0 | 8.5 | 9.0 | 68.45 | 13.68 | 5.4 | 448.03 | 109372.0 | 1822.866667 | 54 | 54 | 389 | 493 | 397.216667 |
| 4 | 16:00:00 | 44 | 41.93 | 243.0 | 18.5 | 9.5 | 21.19 | 5.76 | 5.7 | 303.65 | 106113.0 | 1768.550000 | 76 | 38 | 335 | 662 | 346.033333 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2185 | 04:00:00 | 0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.000000 | 0 | 0 | 0 | 0 | 0.000000 |
| 2186 | 04:00:00 | 0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.000000 | 0 | 0 | 0 | 0 | 0.000000 |
| 2187 | 00:00:00 | 0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.000000 | 0 | 0 | 0 | 0 | 0.000000 |
| 2188 | 00:00:00 | 0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.000000 | 0 | 0 | 0 | 0 | 0.000000 |
| 2189 | 04:00:00 | 0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.000000 | 0 | 0 | 0 | 0 | 0.000000 |
2190 rows × 17 columns
# Taxi rides collectively take longest around 16h00 — evening traffic?
window_totals = temp.groupby("tpep_pickup_datetime").sum(numeric_only=True).reset_index()
sb.barplot(data=window_totals, x="tpep_pickup_datetime", y="duration_mins")
<AxesSubplot: xlabel='tpep_pickup_datetime', ylabel='duration_mins'>
# Best time to make money is around that 16h00 period, earning over $185,
# compared to around $70 around 04h00 or $170 around noon.
window_means = temp.groupby("tpep_pickup_datetime").mean(numeric_only=True).reset_index()
sb.barplot(data=window_means, x="tpep_pickup_datetime", y="fare_amount")
<AxesSubplot: xlabel='tpep_pickup_datetime', ylabel='fare_amount'>
Average nums for each day of the month
# Per-(month, weekday) averages for the first ten groups, gradient-shaded.
drop_cols = ["Unnamed: 0", "VendorID", "RatecodeID", "PULocationID", "DOLocationID", "payment_type", "week"]
month_day_means = data.drop(drop_cols, axis=1).groupby(["month", "day"]).mean(numeric_only=True).reset_index()
month_day_means.head(10).style.background_gradient()
| month | day | passenger_count | trip_distance | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | duration_secs | duration_mins | month_num | day_num | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Jan | Mon | 1.511864 | 2.915085 | 12.216949 | 0.276271 | 0.498305 | 1.737322 | 0.424475 | 0.300000 | 15.453322 | 750.881356 | 12.514689 | 0.000000 | 0.000000 |
| 1 | Jan | Tue | 1.702703 | 2.849610 | 12.965465 | 0.354354 | 0.495495 | 1.838949 | 0.409429 | 0.298198 | 16.373604 | 849.798799 | 14.163313 | 0.000000 | 1.000000 |
| 2 | Jan | Wed | 1.696000 | 2.492000 | 11.782000 | 0.398000 | 0.500000 | 1.697320 | 0.187440 | 0.300000 | 14.872560 | 810.228000 | 13.503800 | 0.000000 | 2.000000 |
| 3 | Jan | Thu | 1.663082 | 2.870466 | 12.978495 | 0.394265 | 0.496416 | 2.029534 | 0.327742 | 0.300000 | 16.544194 | 858.329749 | 14.305496 | 0.000000 | 3.000000 |
| 4 | Jan | Fri | 1.676923 | 2.785577 | 12.648077 | 0.409615 | 0.498077 | 1.760577 | 0.274769 | 0.300000 | 15.891115 | 853.519231 | 14.225321 | 0.000000 | 4.000000 |
| 5 | Jan | Sat | 1.793358 | 2.629889 | 11.934317 | 0.190037 | 0.498155 | 1.718450 | 0.224871 | 0.300000 | 14.865830 | 775.937269 | 12.932288 | 0.000000 | 5.000000 |
| 6 | Jan | Sun | 1.647249 | 3.192621 | 13.839806 | 0.199029 | 0.498382 | 1.734466 | 0.340647 | 0.300000 | 16.924951 | 875.297735 | 14.588296 | 0.000000 | 6.000000 |
| 7 | Feb | Mon | 1.682540 | 3.197989 | 17.998360 | 0.335979 | 0.494709 | 2.806508 | 0.234497 | 0.300000 | 22.196243 | 762.687831 | 12.711464 | 1.000000 | 0.000000 |
| 8 | Feb | Tue | 1.591440 | 2.326965 | 11.313230 | 0.451362 | 0.500000 | 1.615759 | 0.160778 | 0.300000 | 14.348716 | 773.027237 | 12.883787 | 1.000000 | 1.000000 |
| 9 | Feb | Wed | 1.553030 | 3.285227 | 13.717803 | 0.437500 | 0.500000 | 2.124659 | 0.461667 | 0.300000 | 17.549015 | 894.037879 | 14.900631 | 1.000000 | 2.000000 |
Total nums for each day of the month
# Per-(month, weekday) totals for the first ten groups, gradient-shaded.
drop_cols = ["Unnamed: 0", "VendorID", "RatecodeID", "PULocationID", "DOLocationID", "payment_type", "week"]
month_day_totals = data.drop(drop_cols, axis=1).groupby(["month", "day"]).sum(numeric_only=True).reset_index()
month_day_totals.head(10).style.background_gradient()
| month | day | passenger_count | trip_distance | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | duration_secs | duration_mins | month_num | day_num | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Jan | Mon | 446 | 859.950000 | 3604.000000 | 81.500000 | 147.000000 | 512.510000 | 125.220000 | 88.500000 | 4558.730000 | 221510.000000 | 3691.833333 | 0 | 0 |
| 1 | Jan | Tue | 567 | 948.920000 | 4317.500000 | 118.000000 | 165.000000 | 612.370000 | 136.340000 | 99.300000 | 5452.410000 | 282983.000000 | 4716.383333 | 0 | 333 |
| 2 | Jan | Wed | 424 | 623.000000 | 2945.500000 | 99.500000 | 125.000000 | 424.330000 | 46.860000 | 75.000000 | 3718.140000 | 202557.000000 | 3375.950000 | 0 | 500 |
| 3 | Jan | Thu | 464 | 800.860000 | 3621.000000 | 110.000000 | 138.500000 | 566.240000 | 91.440000 | 83.700000 | 4615.830000 | 239474.000000 | 3991.233333 | 0 | 837 |
| 4 | Jan | Fri | 436 | 724.250000 | 3288.500000 | 106.500000 | 129.500000 | 457.750000 | 71.440000 | 78.000000 | 4131.690000 | 221915.000000 | 3698.583333 | 0 | 1040 |
| 5 | Jan | Sat | 486 | 712.700000 | 3234.200000 | 51.500000 | 135.000000 | 465.700000 | 60.940000 | 81.300000 | 4028.640000 | 210279.000000 | 3504.650000 | 0 | 1355 |
| 6 | Jan | Sun | 509 | 986.520000 | 4276.500000 | 61.500000 | 154.000000 | 535.950000 | 105.260000 | 92.700000 | 5229.810000 | 270467.000000 | 4507.783333 | 0 | 1854 |
| 7 | Feb | Mon | 318 | 604.420000 | 3401.690000 | 63.500000 | 93.500000 | 530.430000 | 44.320000 | 56.700000 | 4195.090000 | 144148.000000 | 2402.466667 | 189 | 0 |
| 8 | Feb | Tue | 409 | 598.030000 | 2907.500000 | 116.000000 | 128.500000 | 415.250000 | 41.320000 | 77.100000 | 3687.620000 | 198668.000000 | 3311.133333 | 257 | 257 |
| 9 | Feb | Wed | 410 | 867.300000 | 3621.500000 | 115.500000 | 132.000000 | 560.910000 | 121.880000 | 79.200000 | 4632.940000 | 236026.000000 | 3933.766667 | 264 | 528 |
Average nums for each day of the week
# Average numbers per day of the week, gradient-shaded.
drop_cols = ["Unnamed: 0", "VendorID", "RatecodeID", "PULocationID", "DOLocationID", "payment_type", "week"]
weekday_means = data.drop(drop_cols, axis=1).groupby(["day"]).mean(numeric_only=True).reset_index()
weekday_means.head(10).style.background_gradient()
| day | passenger_count | trip_distance | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | duration_secs | duration_mins | month_num | day_num | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Mon | 1.620607 | 3.064715 | 13.432852 | 0.375299 | 0.497271 | 1.935827 | 0.370051 | 0.299488 | 16.913808 | 925.504606 | 15.425077 | 5.486182 | 0.000000 |
| 1 | Tue | 1.619137 | 2.805744 | 13.052908 | 0.390557 | 0.497811 | 1.865009 | 0.315710 | 0.299625 | 16.424997 | 916.036898 | 15.267282 | 5.367730 | 1.000000 |
| 2 | Wed | 1.611209 | 2.827032 | 12.886655 | 0.397050 | 0.498378 | 1.916667 | 0.310519 | 0.299558 | 16.315773 | 1090.023599 | 18.167060 | 5.502950 | 2.000000 |
| 3 | Thu | 1.608172 | 2.922404 | 13.373560 | 0.390212 | 0.496620 | 1.900791 | 0.338586 | 0.299559 | 16.808322 | 1113.103762 | 18.551729 | 5.529101 | 3.000000 |
| 4 | Fri | 1.632288 | 2.817017 | 12.980021 | 0.393788 | 0.497656 | 1.842406 | 0.335775 | 0.299648 | 16.354744 | 1072.891591 | 17.881527 | 5.374158 | 4.000000 |
| 5 | Sat | 1.712504 | 2.812043 | 12.350760 | 0.194684 | 0.497178 | 1.638966 | 0.222257 | 0.299465 | 15.205049 | 975.732997 | 16.262217 | 5.334125 | 5.000000 |
| 6 | Sun | 1.694797 | 3.190644 | 13.178165 | 0.181121 | 0.497165 | 1.755060 | 0.300617 | 0.299500 | 16.218833 | 1034.213476 | 17.236891 | 5.360574 | 6.000000 |
Average nums for each month of the year
# Average numbers per month of the year, gradient-shaded (first ten months shown).
drop_cols = ["Unnamed: 0", "VendorID", "RatecodeID", "PULocationID", "DOLocationID", "payment_type", "week"]
monthly_means = data.drop(drop_cols, axis=1).groupby(["month"]).mean(numeric_only=True).reset_index()
monthly_means.head(10).style.background_gradient()
| month | passenger_count | trip_distance | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | duration_secs | duration_mins | month_num | day_num | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Jan | 1.668503 | 2.832349 | 12.662594 | 0.314722 | 0.497747 | 1.790110 | 0.319229 | 0.299700 | 15.891462 | 825.831247 | 13.763854 | 0.000000 | 2.963946 |
| 1 | Feb | 1.645562 | 2.831933 | 13.005195 | 0.338609 | 0.497739 | 1.934585 | 0.276439 | 0.299661 | 16.358332 | 904.775014 | 15.079584 | 1.000000 | 3.140192 |
| 2 | Mar | 1.618350 | 2.883324 | 12.903631 | 0.345778 | 0.496828 | 1.810137 | 0.288180 | 0.299414 | 16.147335 | 983.993655 | 16.399894 | 2.000000 | 3.160078 |
| 3 | Apr | 1.601288 | 2.934656 | 12.647945 | 0.328133 | 0.497524 | 1.780248 | 0.298524 | 0.299406 | 15.855641 | 1128.545319 | 18.809089 | 3.000000 | 3.212977 |
| 4 | May | 1.643318 | 3.013130 | 13.397923 | 0.346001 | 0.498510 | 1.943835 | 0.308803 | 0.299702 | 16.805057 | 1037.839046 | 17.297317 | 4.000000 | 2.861401 |
| 5 | Jun | 1.664969 | 2.977001 | 13.468814 | 0.347505 | 0.495927 | 1.839027 | 0.300621 | 0.299542 | 16.761976 | 1157.679735 | 19.294662 | 5.000000 | 2.996945 |
| 6 | Jul | 1.706541 | 2.820029 | 12.588863 | 0.343253 | 0.496759 | 1.656730 | 0.293235 | 0.299293 | 15.685115 | 1081.131408 | 18.018857 | 6.000000 | 3.020035 |
| 7 | Aug | 1.680394 | 2.985226 | 12.925180 | 0.342807 | 0.498260 | 1.714292 | 0.316334 | 0.299826 | 16.101833 | 897.383991 | 14.956400 | 7.000000 | 2.803364 |
| 8 | Sep | 1.625144 | 2.941488 | 13.011534 | 0.325548 | 0.497982 | 1.806943 | 0.322745 | 0.299654 | 16.266655 | 1013.831027 | 16.897184 | 8.000000 | 3.085352 |
| 9 | Oct | 1.581154 | 2.890656 | 12.967859 | 0.337444 | 0.497287 | 1.848757 | 0.360829 | 0.299556 | 16.312694 | 922.344351 | 15.372406 | 9.000000 | 2.923532 |
Total duration for each hour for every day of the week
# Total trip seconds per (weekday, hour) cell, shaded red by magnitude.
hourly_totals = (
    data[['month', "day", "hour", 'duration_secs']]
    .groupby(["day", "hour"])
    .sum(numeric_only=True)
    .reset_index()
)
hourly_totals.pivot(columns="day", index="hour", values="duration_secs").style.background_gradient(cmap="Reds")
C:\Users\Jason\AppData\Local\Temp\ipykernel_14164\2620107525.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. data[['month', "day", "hour",'duration_secs']].groupby(["day", "hour"]).sum().reset_index().pivot(columns = "day", index = "hour", values = "duration_secs").style.background_gradient(cmap="Reds")
| day | Mon | Tue | Wed | Thu | Fri | Sat | Sun |
|---|---|---|---|---|---|---|---|
| hour | |||||||
| 0 | 38862.000000 | 40722.000000 | 52228.000000 | 62367.000000 | 94209.000000 | 204469.000000 | 147788.000000 |
| 1 | 22787.000000 | 24134.000000 | 22432.000000 | 29200.000000 | 46065.000000 | 191805.000000 | 184829.000000 |
| 2 | 19623.000000 | 18327.000000 | 15190.000000 | 25536.000000 | 37076.000000 | 164229.000000 | 150645.000000 |
| 3 | 19302.000000 | 19034.000000 | 8919.000000 | 14315.000000 | 12479.000000 | 53133.000000 | 82270.000000 |
| 4 | 6443.000000 | 15439.000000 | 9220.000000 | 21398.000000 | 20011.000000 | 41307.000000 | 51770.000000 |
| 5 | 29766.000000 | 11309.000000 | 23688.000000 | 29275.000000 | 27235.000000 | 21588.000000 | 21260.000000 |
| 6 | 62143.000000 | 43773.000000 | 58133.000000 | 68603.000000 | 62394.000000 | 25789.000000 | 16026.000000 |
| 7 | 98225.000000 | 99796.000000 | 130642.000000 | 115997.000000 | 186094.000000 | 41691.000000 | 109834.000000 |
| 8 | 134528.000000 | 161328.000000 | 314016.000000 | 164927.000000 | 153944.000000 | 42419.000000 | 44621.000000 |
| 9 | 144429.000000 | 150299.000000 | 191268.000000 | 157763.000000 | 249017.000000 | 93568.000000 | 144867.000000 |
| 10 | 134091.000000 | 139957.000000 | 155439.000000 | 207698.000000 | 241140.000000 | 84865.000000 | 93034.000000 |
| 11 | 147551.000000 | 162668.000000 | 240542.000000 | 284291.000000 | 165428.000000 | 194987.000000 | 104397.000000 |
| 12 | 119534.000000 | 127735.000000 | 152722.000000 | 181283.000000 | 170374.000000 | 134771.000000 | 153530.000000 |
| 13 | 126219.000000 | 214413.000000 | 173703.000000 | 159368.000000 | 128368.000000 | 141820.000000 | 245732.000000 |
| 14 | 246133.000000 | 208297.000000 | 138133.000000 | 199687.000000 | 163898.000000 | 253579.000000 | 167933.000000 |
| 15 | 149031.000000 | 181703.000000 | 201264.000000 | 178007.000000 | 194256.000000 | 146213.000000 | 231953.000000 |
| 16 | 146979.000000 | 236633.000000 | 156146.000000 | 195261.000000 | 170941.000000 | 146870.000000 | 164651.000000 |
| 17 | 118572.000000 | 168109.000000 | 194816.000000 | 274634.000000 | 204962.000000 | 163509.000000 | 260220.000000 |
| 18 | 194150.000000 | 239935.000000 | 296015.000000 | 197051.000000 | 188136.000000 | 192176.000000 | 146597.000000 |
| 19 | 158315.000000 | 156002.000000 | 260556.000000 | 198013.000000 | 269069.000000 | 177685.000000 | 220487.000000 |
| 20 | 129501.000000 | 148751.000000 | 250293.000000 | 256430.000000 | 309608.000000 | 232319.000000 | 111856.000000 |
| 21 | 295615.000000 | 138778.000000 | 281148.000000 | 189172.000000 | 172752.000000 | 220166.000000 | 85711.000000 |
| 22 | 99699.000000 | 129134.000000 | 257161.000000 | 162954.000000 | 246186.000000 | 159164.000000 | 94739.000000 |
| 23 | 71156.000000 | 93210.000000 | 111506.000000 | 413549.000000 | 148137.000000 | 157171.000000 | 65822.000000 |
Bunch of heatmaps
# Heatmap of total trip minutes per (weekday, hour).
plt.figure(figsize=(11, 11))
sb.heatmap(
    data[['month', "day", "hour", 'duration_mins']]
    .groupby(["day", "hour"])
    .sum(numeric_only=True)
    .reset_index()
    .pivot(columns="day", index="hour", values="duration_mins")
)
# The original title said "4-hour interval", but this aggregation is per hour.
plt.title("Total taxi trip duration (in minutes) for each hour of every weekday")
C:\Users\Jason\AppData\Local\Temp\ipykernel_14164\777194791.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. sb.heatmap(data[['month', "day", "hour",'duration_mins']].groupby(["day", "hour"]).sum().reset_index().pivot(columns = "day", index = "hour", values = "duration_mins"))
Text(0.5, 1.0, 'Total taxi trip duration (in minutes) for each 4-hour interval everyday')
from matplotlib import pyplot as plt
plt.figure(figsize=(11, 7))
# numeric_only=True silences the pandas FutureWarning about summing non-numeric columns
# (the warning was visible in this cell's original output).
sb.heatmap(data[['month', "day", "time_of_day",'duration_mins']].groupby(["day", "time_of_day"]).sum(numeric_only = True).reset_index().pivot(columns = "day", index = "time_of_day", values = "duration_mins"))
plt.title("Total taxi trip duration (in minutes) for each 4-hour interval everyday")
C:\Users\Jason\AppData\Local\Temp\ipykernel_14164\3928804222.py:4: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. sb.heatmap(data[['month', "day", "time_of_day",'duration_mins']].groupby(["day", "time_of_day"]).sum().reset_index().pivot(columns = "day", index = "time_of_day", values = "duration_mins"))
Text(0.5, 1.0, 'Total taxi trip duration (in seconds) for each 4-hour interval everyday')
data.columns
Index(['Unnamed: 0', 'VendorID', 'tpep_pickup_datetime',
'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
'tolls_amount', 'improvement_surcharge', 'total_amount',
'duration_secs', 'duration_mins', 'week', 'day', 'month', 'month_num',
'day_num', 'tpep_pickup_time', 'time_of_day', 'period_of_day', 'hour'],
dtype='object')
# Plot duration and fare totals per drop-off timestamp on two y-axes sharing one x-axis.
# Aggregate once and reuse for both axes (the original recomputed the same groupby twice).
dropoff_totals = data.groupby("tpep_dropoff_datetime").sum(numeric_only = True).reset_index()
# plot line graph on axis #1
ax1 = sb.lineplot(
    x='tpep_dropoff_datetime',
    y='duration_mins',
    data=dropoff_totals,
    sort=False,
    color='blue'
)
ax1.set_ylabel('duration')
#ax1.set_ylim(0, 25)
ax1.legend(['duration'], loc="upper left")
# set up the 2nd axis
ax2 = ax1.twinx()
# plot line graph on axis #2
sb.lineplot(
    x = 'tpep_dropoff_datetime',
    y = 'fare_amount',
    data = dropoff_totals,
    color ='orange',
    alpha = 0.5,
    ax = ax2 # Pre-existing axes for the plot
)
# grid()'s 'b' parameter was renamed 'visible' in Matplotlib 3.5 (was emitting a
# MatplotlibDeprecationWarning in this cell's output)
ax2.grid(visible = False) # turn off grid #2
ax2.set_ylabel('Fare Amount')
#ax2.set_ylim(0, 90)
ax2.legend(['Fare Amount'], loc = "upper right")
plt.show()
C:\Users\Jason\AppData\Local\Temp\ipykernel_14164\1151285376.py:23: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later. ax2.grid(b = False) # turn off grid #2
Distribution of trip durations (in seconds) per month
Excludes outliers
# Boxplot of non-negative trip durations per month; outliers hidden for readability.
plt.figure(figsize = (9,6))
sb.boxplot(data = data[data.duration_secs >= 0], y = "duration_secs", x = "month", showfliers = False)
<AxesSubplot: xlabel='month', ylabel='duration_secs'>
# Same data as the boxplot above, as a line plot (seaborn aggregates to mean + CI band).
plt.figure(figsize = (9,6))
sb.lineplot(data = data[data.duration_secs >= 0], y = "duration_secs", x = "month")
<AxesSubplot: xlabel='month', ylabel='duration_secs'>
I made these functions to label-encode and one-hot-encode variables easily. I ended up not encoding anything, so I never used them — I'll just keep them for future use =D
def one_hot_enc(dataset: pd.DataFrame, variables: "str | list | tuple"):
    '''
    Perform one-hot encoding on the provided dataset. Avoids having to manually
    one-hot encode each variable.

    # Parameters:
    dataset: pandas DataFrame
        The dataset containing the variables to encode.
    variables: single column label (str) or list/tuple of column labels
        The variables to be replaced with one-hot encoded versions.

    # Returns:
    pandas DataFrame with the one-hot encoded variables.

    # Raises:
    TypeError if dataset is not a DataFrame or variables is not str/list/tuple.
    '''
    # BUG FIX: the original body read the *global* `data` instead of the `dataset`
    # parameter (only one line used `dataset`), so the argument was silently ignored.
    if not isinstance(dataset, pd.DataFrame):
        raise TypeError("You should use pandas dataframes with string or list-like object containing column names")
    if isinstance(variables, str):
        return pd.get_dummies(dataset[variables])
    if isinstance(variables, (list, tuple)):
        output = pd.DataFrame()
        for variable in variables:
            dummies = pd.get_dummies(dataset[variable])
            # First variable seeds the output; subsequent ones are concatenated column-wise.
            output = dummies if output.empty else pd.concat([output, dummies], axis = 1)
        return output
    raise TypeError("You should use pandas dataframes with string or list-like object containing column names")
def label_enc(dataset: pd.DataFrame, variables: "str | list | tuple"):
    '''
    Perform label encoding on the provided dataset.

    # Parameters:
    ------------
    dataset: pandas DataFrame
        The dataset containing the variables to encode. Columns must be of
        categorical dtype (encoding uses `.cat.codes`).
    variables: single column label (str) or list/tuple of column labels
        The variables to be replaced with label encoded versions.

    # Returns:
    -----------
    pandas Series (single variable) or DataFrame (multiple variables)
    with the label encoded variables.

    # Raises:
    TypeError if dataset is not a DataFrame or variables is not str/list/tuple.
    '''
    # BUG FIX: the original body read the *global* `data` instead of the `dataset`
    # parameter; leftover debug print(1/2/3) calls were also removed.
    if not isinstance(dataset, pd.DataFrame):
        raise TypeError("You should use pandas dataframes with string or list-like object containing column names")
    if isinstance(variables, str):
        return dataset[variables].cat.codes
    if isinstance(variables, (list, tuple)):
        output = pd.DataFrame()
        for variable in variables:
            codes = dataset[variable].cat.codes
            # First variable seeds the output; later ones are concatenated column-wise.
            output = codes if output.empty else pd.concat([output, codes], axis = 1)
        return output
    raise TypeError("You should use pandas dataframes with string or list-like object containing column names")
Client request: Provide a summary of the column Dtypes, data value nonnull counts, relevant and irrelevant columns, along with anything else code related you think is worth showing in the notebook?
data.describe()
| VendorID | passenger_count | trip_distance | RatecodeID | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 | 22699.000000 |
| mean | 1.556236 | 1.642319 | 2.913313 | 1.043394 | 162.412353 | 161.527997 | 1.336887 | 13.026629 | 0.333275 | 0.497445 | 1.835781 | 0.312542 | 0.299551 | 16.310502 |
| std | 0.496838 | 1.285231 | 3.653171 | 0.708391 | 66.633373 | 70.139691 | 0.496211 | 13.243791 | 0.463097 | 0.039465 | 2.800626 | 1.399212 | 0.015673 | 16.097295 |
| min | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | -120.000000 | -1.000000 | -0.500000 | 0.000000 | 0.000000 | -0.300000 | -120.300000 |
| 25% | 1.000000 | 1.000000 | 0.990000 | 1.000000 | 114.000000 | 112.000000 | 1.000000 | 6.500000 | 0.000000 | 0.500000 | 0.000000 | 0.000000 | 0.300000 | 8.750000 |
| 50% | 2.000000 | 1.000000 | 1.610000 | 1.000000 | 162.000000 | 162.000000 | 1.000000 | 9.500000 | 0.000000 | 0.500000 | 1.350000 | 0.000000 | 0.300000 | 11.800000 |
| 75% | 2.000000 | 2.000000 | 3.060000 | 1.000000 | 233.000000 | 233.000000 | 2.000000 | 14.500000 | 0.500000 | 0.500000 | 2.450000 | 0.000000 | 0.300000 | 17.800000 |
| max | 2.000000 | 6.000000 | 33.960000 | 99.000000 | 265.000000 | 265.000000 | 4.000000 | 999.990000 | 4.500000 | 0.500000 | 200.000000 | 19.100000 | 0.300000 | 1200.290000 |
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 22699 entries, 24870114 to 17208911 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 VendorID 22699 non-null int64 1 tpep_pickup_datetime 22699 non-null object 2 tpep_dropoff_datetime 22699 non-null object 3 passenger_count 22699 non-null int64 4 trip_distance 22699 non-null float64 5 RatecodeID 22699 non-null int64 6 store_and_fwd_flag 22699 non-null object 7 PULocationID 22699 non-null int64 8 DOLocationID 22699 non-null int64 9 payment_type 22699 non-null int64 10 fare_amount 22699 non-null float64 11 extra 22699 non-null float64 12 mta_tax 22699 non-null float64 13 tip_amount 22699 non-null float64 14 tolls_amount 22699 non-null float64 15 improvement_surcharge 22699 non-null float64 16 total_amount 22699 non-null float64 dtypes: float64(8), int64(6), object(3) memory usage: 3.1+ MB
data.isnull().sum()
VendorID 0 tpep_pickup_datetime 0 tpep_dropoff_datetime 0 passenger_count 0 trip_distance 0 RatecodeID 0 store_and_fwd_flag 0 PULocationID 0 DOLocationID 0 payment_type 0 fare_amount 0 extra 0 mta_tax 0 tip_amount 0 tolls_amount 0 improvement_surcharge 0 total_amount 0 dtype: int64
Money variables - cost of taxi rides
data.groupby(["payment_type"]).agg(["mean", "median", "min", "max"])[["tip_amount", "tolls_amount", "total_amount",]]
C:\Users\Jason\AppData\Local\Temp\ipykernel_16012\2768404590.py:1: FutureWarning: ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'store_and_fwd_flag'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning. data.groupby(["payment_type"]).agg(["mean", "median", "min", "max"])[["tip_amount", "tolls_amount", "total_amount",]]
| tip_amount | tolls_amount | total_amount | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mean | median | min | max | mean | median | min | max | mean | median | min | max | |
| payment_type | ||||||||||||
| 1 | 2.7298 | 2.0 | 0.0 | 200.0 | 0.357659 | 0.0 | 0.0 | 19.10 | 17.663577 | 12.95 | 0.0 | 1200.29 |
| 2 | 0.0000 | 0.0 | 0.0 | 0.0 | 0.214441 | 0.0 | 0.0 | 18.28 | 13.545821 | 9.80 | 0.0 | 450.30 |
| 3 | 0.0000 | 0.0 | 0.0 | 0.0 | 0.388595 | 0.0 | 0.0 | 12.50 | 13.579669 | 8.30 | -5.3 | 78.30 |
| 4 | 0.0000 | 0.0 | 0.0 | 0.0 | 0.638261 | 0.0 | 0.0 | 11.52 | 11.238261 | 9.30 | -120.3 | 64.32 |
Total amount
Money should not be negative, but for some reason the total amount above has negative values. Let's investigate
f"There are {data[data.total_amount < 0].shape[0]} negative values for the total amount paid"
'There are 14 negative values for the total amount paid'
data[data.total_amount < 0].groupby("payment_cats").total_amount.count()
# all the negative amounts come from disputed or uncharged transactions, wonder what happened there
payment_cats Dispute 7 No charge 7 Name: total_amount, dtype: int64
# Lets see these rides!
data[data.total_amount < 0]
| VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 105454287 | 2 | 12/13/2017 2:02:39 AM | 12/13/2017 2:03:08 AM | 6 | 0.12 | 1 | N | 161 | 161 | 3 | -2.5 | -0.5 | -0.5 | 0.0 | 0.0 | -0.3 | -3.8 |
| 57337183 | 2 | 07/05/2017 11:02:23 AM | 07/05/2017 11:03:00 AM | 1 | 0.04 | 1 | N | 79 | 79 | 3 | -2.5 | 0.0 | -0.5 | 0.0 | 0.0 | -0.3 | -3.3 |
| 97329905 | 2 | 11/16/2017 8:13:30 PM | 11/16/2017 8:14:50 PM | 2 | 0.06 | 1 | N | 237 | 237 | 4 | -3.0 | -0.5 | -0.5 | 0.0 | 0.0 | -0.3 | -4.3 |
| 28459983 | 2 | 04/06/2017 12:50:26 PM | 04/06/2017 12:52:39 PM | 1 | 0.25 | 1 | N | 90 | 68 | 3 | -3.5 | 0.0 | -0.5 | 0.0 | 0.0 | -0.3 | -4.3 |
| 833948 | 2 | 01/03/2017 8:15:23 PM | 01/03/2017 8:15:39 PM | 1 | 0.02 | 1 | N | 170 | 170 | 3 | -2.5 | -0.5 | -0.5 | 0.0 | 0.0 | -0.3 | -3.8 |
| 91187947 | 2 | 10/28/2017 8:39:36 PM | 10/28/2017 8:41:59 PM | 1 | 0.41 | 1 | N | 236 | 237 | 3 | -3.5 | -0.5 | -0.5 | 0.0 | 0.0 | -0.3 | -4.8 |
| 55302347 | 2 | 06/05/2017 5:34:25 PM | 06/05/2017 5:36:29 PM | 2 | 0.00 | 1 | N | 238 | 238 | 4 | -2.5 | -1.0 | -0.5 | 0.0 | 0.0 | -0.3 | -4.3 |
| 58395501 | 2 | 07/09/2017 7:20:59 AM | 07/09/2017 7:23:50 AM | 1 | 0.64 | 1 | N | 50 | 48 | 3 | -4.5 | 0.0 | -0.5 | 0.0 | 0.0 | -0.3 | -5.3 |
| 29059760 | 2 | 04/08/2017 12:00:16 AM | 04/08/2017 11:15:57 PM | 1 | 0.17 | 5 | N | 138 | 138 | 4 | -120.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.3 | -120.3 |
| 109276092 | 2 | 12/24/2017 10:37:58 PM | 12/24/2017 10:41:08 PM | 5 | 0.40 | 1 | N | 164 | 161 | 4 | -4.0 | -0.5 | -0.5 | 0.0 | 0.0 | -0.3 | -5.3 |
| 24690146 | 2 | 03/24/2017 7:31:13 PM | 03/24/2017 7:34:49 PM | 1 | 0.46 | 1 | N | 87 | 45 | 4 | -4.0 | -1.0 | -0.5 | 0.0 | 0.0 | -0.3 | -5.8 |
| 43859760 | 2 | 05/22/2017 3:51:20 PM | 05/22/2017 3:52:22 PM | 1 | 0.10 | 1 | N | 230 | 163 | 3 | -3.0 | 0.0 | -0.5 | 0.0 | 0.0 | -0.3 | -3.8 |
| 75926915 | 2 | 09/09/2017 10:59:51 PM | 09/09/2017 11:02:06 PM | 1 | 0.24 | 1 | N | 116 | 116 | 4 | -3.5 | -0.5 | -0.5 | 0.0 | 0.0 | -0.3 | -4.8 |
| 14668209 | 2 | 02/24/2017 12:38:17 AM | 02/24/2017 12:42:05 AM | 1 | 0.70 | 1 | N | 65 | 25 | 4 | -4.5 | -0.5 | -0.5 | 0.0 | 0.0 | -0.3 | -5.8 |
data[(data.total_amount < 0) & ((data.PULocationID == data.DOLocationID) | (data.trip_distance == 0))].groupby("payment_cats").total_amount.count()
# seven of these uncharged or disputed transactions ended in the same area they started
# I'll sort these out when tackling outliers later
payment_cats Dispute 4 No charge 3 Name: total_amount, dtype: int64
At this point I'm just trying to figure out if there are any patterns we can identify visually. Enjoy the heatmaps! :)
data[['month', "day", 'duration_secs']].groupby(['month', "day"]).sum().reset_index()
| month | day | duration_secs | |
|---|---|---|---|
| 0 | Jan | Mon | 221510.0 |
| 1 | Jan | Tue | 282983.0 |
| 2 | Jan | Wed | 202557.0 |
| 3 | Jan | Thu | 239474.0 |
| 4 | Jan | Fri | 221915.0 |
| ... | ... | ... | ... |
| 79 | Dec | Wed | 596897.0 |
| 80 | Dec | Thu | 533153.0 |
| 81 | Dec | Fri | 344507.0 |
| 82 | Dec | Sat | 236396.0 |
| 83 | Dec | Sun | 293622.0 |
84 rows × 3 columns
# Heatmap: rows = day of week, columns = month, cell = total trip seconds.
plt.figure(figsize=(11, 7))
plt.title("Total durations of taxi trips (in seconds) for everyday of each month")
sb.heatmap(data[['month', "day", 'duration_secs']].groupby(['month', "day"]).sum().reset_index().pivot(columns = "month", index = "day", values = "duration_secs"))
<AxesSubplot: title={'center': 'Total durations of taxi trips (in seconds) for everyday of each month'}, xlabel='month', ylabel='day'>
# Same layout as above but with mean (not total) trip seconds per month/day cell.
plt.figure(figsize=(11, 7))
sb.heatmap(data[['month', "day", 'duration_secs']].groupby(['month', "day"]).mean().reset_index().pivot(columns = "month", index = "day", values = "duration_secs"))
plt.title("Average taxi trip duration (in seconds) for everyday of each month")
Text(0.5, 1.0, 'Average taxi trip duration (in seconds) for everyday of each month')
# Grouped bars: mean trip seconds per month, one bar per weekday (hue).
plt.figure(figsize=(12, 7))
plt.title("Average aggregated duration (in seconds) of taxi rides for each day of the month")
sb.barplot(data = data[['month', "day", 'duration_secs']].groupby(['month', "day"]).mean().reset_index(), x = "month", y = "duration_secs", hue = "day")
<AxesSubplot: title={'center': 'Average aggregated duration (in seconds) of taxi rides for each day of the month'}, xlabel='month', ylabel='duration_secs'>
# Grouped bars: total (summed) trip seconds per month, one bar per weekday (hue).
plt.figure(figsize=(12, 7))
plt.title("Total aggregated duration (in seconds) of taxi rides for each day of the month")
sb.barplot(data = data[['month', "day", 'duration_secs']].groupby(['month', "day"]).sum().reset_index(), x = "month", y = "duration_secs", hue = "day")
<AxesSubplot: title={'center': 'Total aggregated duration (in seconds) of taxi rides for each day of the month'}, xlabel='month', ylabel='duration_secs'>
# Per-month distribution of raw trip seconds; outliers hidden so the boxes stay readable.
plt.figure(figsize=(12, 7))
plt.title("Distribution of trip duration (in seconds) of taxi rides for each month")
sb.boxplot(data = data[['month', "day", 'duration_secs']], x = "month", y = "duration_secs", showfliers = False)
<AxesSubplot: title={'center': 'Distribution of trip duration (in seconds) of taxi rides for each day of the month'}, xlabel='month', ylabel='duration_secs'>
# Per-weekday distribution of raw trip seconds; outliers hidden as above.
plt.figure(figsize=(12, 7))
plt.title("Distribution of trip duration (in seconds) of taxi rides for each day of the week")
sb.boxplot(data = data[['month', "day", 'duration_secs']], x = "day", y = "duration_secs", showfliers = False)
<AxesSubplot: title={'center': 'Distribution of trip duration (in seconds) of taxi rides for each day of the week'}, xlabel='day', ylabel='duration_secs'>
plt.figure(figsize=(11, 7))
# Title fixed: this heatmap sums passenger_count, so it shows passenger totals,
# not "durations of taxi passengers" as the original title said.
plt.title("Total number of taxi passengers for every day of each month")
sb.heatmap(data[['month', "day", 'passenger_count']].groupby(['month', "day"]).sum().reset_index().pivot(columns = "month", index = "day", values = "passenger_count"))
<AxesSubplot: title={'center': 'Total durations of taxi passengers for everyday of each month'}, xlabel='month', ylabel='day'>
plt.figure(figsize=(11, 7))
# Title grammar fixed ("number taxi passengers" -> "number of taxi passengers");
# the cells hold the mean passenger_count per month/day combination.
plt.title("Average number of taxi passengers for everyday of each month")
sb.heatmap(data[['month', "day", 'passenger_count']].groupby(['month', "day"]).mean().reset_index().pivot(columns = "month", index = "day", values = "passenger_count"))
<AxesSubplot: title={'center': 'Average number taxi passengers for everyday of each month'}, xlabel='month', ylabel='day'>
# NOTE(review): barplot receives a wide-form pivot positionally (seaborn >= 0.12 style);
# y-limits zoomed to 1.5-1.75 to magnify the small month-to-month differences.
plt.figure(figsize=(11, 7))
plt.title("Number of taxi passengers for everyday of each month")
plt.ylim(1.5, 1.75)
sb.barplot(data[['month', "day", 'passenger_count']].groupby(['month', "day"]).mean().reset_index().pivot(columns = "month", index = "day", values = "passenger_count"))
<AxesSubplot: title={'center': 'Number of taxi passengers for everyday of each month'}, xlabel='month'>
# Mean passenger_count per weekday, zoomed to the 1.5-1.75 band as above.
plt.figure(figsize=(11, 7))
plt.title("Number of taxi passengers for everyday of each month")
plt.ylim(1.5, 1.75)
sb.barplot(data[['month', "day", 'passenger_count']].groupby(['month', "day"]).mean().reset_index(), x = "day", y = "passenger_count")
<AxesSubplot: title={'center': 'Number of taxi passengers for everyday of each month'}, xlabel='day', ylabel='passenger_count'>
plt.figure(figsize=(11, 7))
# BUG FIX: the column list contained an empty string '' instead of 'duration_secs',
# which raises a KeyError on selection; the pivot below reads 'duration_secs' values
# and the title refers to trip duration.
sb.heatmap(data[['month', "day", 'duration_secs']].groupby(['month', "day"]).mean().reset_index().pivot(columns = "month", index = "day", values = "duration_secs"))
plt.title("Average taxi trip duration (in seconds) for everyday of each month")
data[['RatecodeID', "fare_amount"]].groupby('RatecodeID').agg(["count", 'mean'])["fare_amount"].reset_index().style.background_gradient()
| RatecodeID | count | mean | |
|---|---|---|---|
| 0 | 1 | 22070 | 11.807340 |
| 1 | 2 | 513 | 52.000000 |
| 2 | 3 | 39 | 61.961538 |
| 3 | 4 | 8 | 73.875000 |
| 4 | 5 | 68 | 78.570000 |
| 5 | 99 | 1 | 77.200000 |
# Bar height = number of rides billed under each RatecodeID (the "count" aggregate).
plt.title("Number of taxi passengers charged each RatecodeID")
sb.barplot(data[['RatecodeID', "fare_amount"]].groupby('RatecodeID').agg(["count", 'mean'])["fare_amount"].reset_index(), x = "RatecodeID", y = "count")
<AxesSubplot: title={'center': 'Number of taxi passengers charged each RatecodeID'}, xlabel='RatecodeID', ylabel='count'>
# This cell plots the MEAN fare per RatecodeID (y = "mean"); the original title was
# copy-pasted from the count plot above, so it is corrected here.
plt.title("Average fare amount for each RatecodeID")
sb.barplot(data[['RatecodeID', "fare_amount"]].groupby('RatecodeID').agg(["count", 'mean'])["fare_amount"].reset_index(), x = "RatecodeID", y = "mean")
<AxesSubplot: title={'center': 'Number of taxi passengers charged each RatecodeID'}, xlabel='RatecodeID', ylabel='mean'>
The functions below attempt to replicate the correlation_matrix function from R
cols = ['trip_distance', 'fare_amount', 'tip_amount','tolls_amount', 'total_amount']
def corrdot(*args, **kwargs):
    """Annotate the current axes with the Pearson r of the first two series.

    Designed for seaborn's PairGrid.map_upper: args[0]/args[1] are the two
    columns of the current cell. The annotation's font size grows with |r|
    so stronger correlations stand out visually.
    """
    x_series, y_series = args[0], args[1]
    r = x_series.corr(y_series, 'pearson')
    # 5pt floor plus up to 80pt scaled by the correlation strength.
    size = abs(r) * 80 + 5
    plt.gca().annotate(round(r, 2), [.5, .5,], xycoords = "axes fraction",
                       ha ='center', va ='center', fontsize = size)
def corrfunc(x, y, **kws):
    """Annotate the current axes with significance stars for Pearson's r.

    Star convention: * p<=0.05, ** p<=0.01, *** p<=0.001; nothing otherwise.
    Designed for seaborn's PairGrid.map_upper.
    """
    _, p_value = stats.pearsonr(x, y)
    # Map the p-value to the conventional star notation (most significant first).
    if p_value <= 0.001:
        stars = '***'
    elif p_value <= 0.01:
        stars = '**'
    elif p_value <= 0.05:
        stars = '*'
    else:
        stars = ''
    ax = plt.gca()
    ax.annotate(stars, xy = (0.65, 0.6), xycoords = ax.transAxes,
                color = 'red', fontsize = 70)
# R-style correlation matrix: scatter+lowess below the diagonal, histograms with KDE
# on the diagonal, and Pearson r / significance stars (corrdot/corrfunc) above it.
sb.set(style='white', font_scale=1.6)
np.seterr(invalid = 'ignore')  # suppress invalid-value warnings from lowess/KDE math
g = sb.PairGrid(data[cols], aspect=1.5, diag_sharey=False, despine=False)
g.map_lower(sb.regplot, lowess = True, ci=False,
            line_kws={'color': 'red', 'lw': 1},
            scatter_kws={'color': 'black', 's': 20})
g.map_diag(sb.histplot, color = 'black', edgecolor = 'k', facecolor ='grey',
           kde = True, kde_kws = {'cut': 0.7}, line_kws = {"color": 'red'})
g.map_diag(sb.rugplot, color = 'black')
g.map_upper(corrdot)
g.map_upper(corrfunc)
g.fig.subplots_adjust(wspace = 0, hspace = 0)  # pack the cells edge-to-edge
# Remove axis labels
for ax in g.axes.flatten():
    ax.set_ylabel('')
    ax.set_xlabel('')
# Add titles to the diagonal axes/subplots
for ax, col in zip(np.diag(g.axes), data[cols].columns):
    ax.set_title(col, y=0.82, fontsize=26)
Comprehensive summary of the data, variables and their interactions
pr = ProfileReport(data)
# Profile Report should be here somewhere, uncomment below to plot it.
# It slows down the file so I cleared it, temporarily
# I only report the original variables... no reason really
#pr
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
pr.to_file("taxi duration.html")
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
# The store_and_fwd_flag var seems to have no predictable power, and it is extremely unbalanced
# Compare summary stats of the two flag groups after dropping ID-like columns
# (per the output: N has 22600 rows vs only 99 for Y).
data.drop(["Unnamed: 0", "VendorID", "RatecodeID", "PULocationID", "DOLocationID", "payment_type", "week"], axis = 1).groupby("store_and_fwd_flag").describe()
| passenger_count | trip_distance | ... | day_num | hour | |||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | ... | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| store_and_fwd_flag | |||||||||||||||||||||
| N | 22600.0 | 1.644115 | 1.287143 | 0.0 | 1.0 | 1.0 | 2.0 | 6.0 | 22600.0 | 2.911544 | ... | 5.0 | 6.0 | 22600.0 | 13.725531 | 6.228109 | 0.0 | 9.0 | 14.0 | 19.0 | 23.0 |
| Y | 99.0 | 1.232323 | 0.603194 | 1.0 | 1.0 | 1.0 | 1.0 | 4.0 | 99.0 | 3.317172 | ... | 4.0 | 6.0 | 99.0 | 13.959596 | 5.405815 | 0.0 | 11.0 | 15.0 | 19.0 | 23.0 |
2 rows × 112 columns
sb.scatterplot(data[["trip_distance", "duration_secs" ]], x = "trip_distance", y = "duration_secs")
<AxesSubplot: xlabel='trip_distance', ylabel='duration_secs'>
data["duration_secs"].corr(data["trip_distance"])
0.15360834125205425
There seems to be very little correlation between the two variables. However there are two different clusters, let's investigate!
We'll start with the relationship when trips take longer than 80,000 seconds (>22 hours)
sb.scatterplot(data[data.duration_secs>80000][["trip_distance", "duration_secs" ]], x = "trip_distance", y = "duration_secs")
<AxesSubplot: xlabel='trip_distance', ylabel='duration_secs'>
data[data.duration_secs>80000]["duration_secs"].corr(data[data.duration_secs>80000]["trip_distance"])
0.26355866066221384
There is a weak positive relationship between trip duration and distance for trips longer than 22 hours; maybe these people just drive a little and then relax with their drivers, or they got murdered, or the drivers forgot to switch off the meter... the possibilities are endless.
Let's see what happens when the trips last less than 20,000 seconds (~ <5 hours 30 minutes)
sb.scatterplot(data[data.duration_secs<20000][["trip_distance", "duration_secs" ]], x = "trip_distance", y = "duration_secs")
<AxesSubplot: xlabel='trip_distance', ylabel='duration_secs'>
data[data.duration_secs<20000]["duration_secs"].corr(data[data.duration_secs<20000]["trip_distance"])
0.776168297801309
Well what do you know, suddenly a strong positive relationship, this is what I expected from the overall dataset
The distribution of our interest variable is crazy positively skewed, I'll use natural log transformation below to fix it
sb.histplot(data.duration_secs)
<AxesSubplot: xlabel='duration_secs', ylabel='Count'>
data[data.duration_secs < 0][['Unnamed: 0', 'VendorID', 'tpep_pickup_datetime','tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'payment_type', 'total_amount', 'fare_amount']]
# This observation has a negative duration; they were picked up before they were dropped off;
# Below I remove that observation, along with observation with 0 as duration, to be able to use a log transformation
| Unnamed: 0 | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | payment_type | total_amount | fare_amount | |
|---|---|---|---|---|---|---|---|---|---|
| 9356 | 93542707 | 1 | 2017-11-05 01:23:08 | 2017-11-05 01:06:09 | 1 | 5.7 | 3 | 29.3 | 28.0 |
# Drop rows with non-positive duration (one row was picked up after its drop-off)
# so the log transform below is defined for every remaining value.
data.drop(axis = 0 , index = data[data.duration_secs <= 0].index, inplace = True, errors = 'ignore')
# in hindsight I should have removed the 14 observations with negative values... they are not worth this trouble :-)
# Creating z-score to try to pin point outliers
data["duration_secs_log"] = np.log(data.duration_secs)
data["duration_secs_log_z"] = stats.zscore(data["duration_secs_log"])
Normal is the new normal
sb.histplot(data.duration_secs_log)
# the log-transformed duration is very normal, with some outliers
<AxesSubplot: xlabel='duration_secs_log', ylabel='Count'>
# sample some outliers, using 3 zscore as the boundary for outliers
data[data["duration_secs_log_z"] >= 3].sample(5)
| Unnamed: 0 | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | ... | day | month | tpep_pickup_time | month_num | day_num | hour | period_of_day | time_of_day | duration_secs_log | duration_secs_log_z | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9208 | 79635338 | 2 | 2017-09-22 09:20:53 | 2017-09-23 09:04:02 | 1 | 1.82 | 1 | N | 158 | 100 | ... | Fri | Sep | 09:20:53 | 8 | 4 | 9 | My people | 09:00 - 13:00 | 11.354973 | 5.725934 |
| 15579 | 41838754 | 2 | 2017-05-10 18:53:53 | 2017-05-11 18:53:02 | 5 | 0.74 | 1 | N | 161 | 162 | ... | Wed | May | 18:53:53 | 4 | 2 | 18 | night travellers | 17:00 - 21:00 | 11.366153 | 5.739057 |
| 1355 | 31453899 | 2 | 2017-04-17 21:26:49 | 2017-04-18 20:46:13 | 6 | 4.09 | 1 | N | 100 | 13 | ... | Mon | Apr | 21:26:49 | 3 | 0 | 21 | Late night | 21:00 - 01:00 | 11.338143 | 5.706181 |
| 21511 | 56793229 | 2 | 2017-07-02 15:45:27 | 2017-07-03 15:41:54 | 1 | 4.98 | 1 | N | 87 | 49 | ... | Sun | Jul | 15:45:27 | 6 | 6 | 15 | Afternoon rush | 13:00 - 17:00 | 11.364275 | 5.736853 |
| 11672 | 22404356 | 2 | 2017-03-18 14:58:31 | 2017-03-19 14:31:35 | 3 | 3.32 | 1 | N | 230 | 144 | ... | Sat | Mar | 14:58:31 | 2 | 5 | 14 | Afternoon rush | 13:00 - 17:00 | 11.347862 | 5.717588 |
5 rows × 31 columns
# lets check out them outliers on the left side
sb.histplot(data[data.duration_secs_log_z <= -3].duration_secs_log)
<AxesSubplot: xlabel='duration_secs_log', ylabel='Count'>
# and on the right
sb.histplot(data[data.duration_secs_log_z >= 3].duration_secs_log)
<AxesSubplot: xlabel='duration_secs_log', ylabel='Count'>
sb.scatterplot(data[(data.duration_secs_log_z < 3) & (data.duration_secs_log_z > -3)], x = 'duration_mins', y = 'trip_distance')
# The relationship between the trip duration and distance is wild,
# looks like the residual of a horrible linear regression model... #heteroskedasticity #crazyVarianceThings
<AxesSubplot: xlabel='duration_mins', ylabel='trip_distance'>
I'm curious if our new normal dataset follows the empirical rule for normal distributions
# 68%
f"About {((data.duration_secs_log_z >-1) & (data.duration_secs_log_z < 1)).mean():.2%} of the data falls within 1 standard deviation of the mean, compared to 68% for a normal distribution"
'About 74.27% of the data falls within 1 standard deviation of the mean, compared to 68% for a normal distribution'
# 95%
f"{((data.duration_secs_log_z >-2) & (data.duration_secs_log_z < 2)).mean():.2%} of the data falls within 2 standard deviation of the mean, compared to 95% for a normal distribution"
'96.79% of the data falls within 2 standard deviation of the mean, compared to 95% for a normal distribution'
# 99.7%
f"{((data.duration_secs_log_z >-3) & (data.duration_secs_log_z < 3)).mean():.2%} of the data falls within 3 standard deviation of the mean, compared to 99.7% for a normal distribution"
'99.09% of the data falls within 3 standard deviation of the mean, compared to 99.7% for a normal distribution'
I mean....barely, but close enough
Just playing around with the CLT to see if y'all aren't lying to us!
This treats variables in the dataset as the population, then draws a certain number of samples with sample size > 30 to see if the theorem holds
data.duration_secs.median()
671.0
data.duration_secs.mean()
1022.0872441778405
data.duration_secs_log.median()
6.508769136971682
data.duration_secs_log.mean()
6.476605878772017
sb.boxplot(data.duration_secs_log)
<AxesSubplot: >
number_of_samples = 1500
sample_size = 50
# CLT demo: draw `number_of_samples` random samples of trip durations and record
# each sample's mean and std, then look at the distribution of those sample means.
summaries = [
    {"mean": draw.mean(), "std": draw.std()}
    for draw in (data.duration_secs.sample(sample_size) for _ in range(number_of_samples))
]
sample_dist = pd.DataFrame(summaries)
sb.histplot(sample_dist["mean"])
print(f"Mean: { sample_dist['mean'].mean():.4f} \nStandard Error: {sample_dist['mean'].std():.4f}")
Mean: 1006.1934 Standard Error: 497.4321
I mean................. regardless of the sample size or number of samples, there are always at least two (far apart) modes, up to 4 modes. There should be just one mode and an approximately normal distribution.
# 95% CI per sample: mean +/- 1.96 * (sample std / sqrt(n)).
sample_dist["lower"] = sample_dist['mean'] - (1.96* (sample_dist['std']/math.sqrt(sample_size)))
sample_dist["upper"] = sample_dist['mean'] + (1.96* (sample_dist['std']/math.sqrt(sample_size)))
sample_dist.head()
# checks how many of the above sample CIs include the population mean
perc = ((sample_dist["lower"] <= data.duration_secs.mean()) & (sample_dist["upper"] >= data.duration_secs.mean())).mean()
f"95% of confidence should contain the population mean. In this instance, about {perc:.2%} of the {number_of_samples} confidence intervals contain the population mean."
'95% of confidence should contain the population mean. In this instance, about 59.33% of the 1500 confidence intervals contain the population mean.'
sb.histplot(data['trip_distance'])
<AxesSubplot: xlabel='trip_distance', ylabel='Count'>
print(f"Mean: {data['trip_distance'].mean():.4f} \nStandard Deviation: {data['trip_distance'].std():.4f}")
Mean: 2.9165 Standard Deviation: 3.6540
number_of_samples = 1500
sample_size = 30
# Repeat the CLT experiment on trip_distance with n = 30 per sample:
# keep each random sample's mean and std, then plot the sampling distribution.
summaries = [
    {"mean": draw.mean(), "std": draw.std()}
    for draw in (data.trip_distance.sample(sample_size) for _ in range(number_of_samples))
]
sample_dist = pd.DataFrame(summaries)
sb.histplot(sample_dist["mean"])
print(f"Mean: { sample_dist['mean'].mean():.4f} \nStandard Error: {sample_dist['mean'].std():.4f}")
Mean: 2.9119 Standard Error: 0.6689
# Same coverage check as for durations: 95% CI per sample, then the fraction of
# intervals that actually contain the population mean of trip_distance.
sample_dist["lower"] = sample_dist['mean'] - (1.96* (sample_dist['std']/math.sqrt(sample_size)))
sample_dist["upper"] = sample_dist['mean'] + (1.96* (sample_dist['std']/math.sqrt(sample_size)))
perc = ((sample_dist["lower"] <= data.trip_distance.mean()) & (sample_dist["upper"] >= data.trip_distance.mean())).mean()
f"95% of confidence should contain the population mean. In this instance, about {perc:.2%} of the {number_of_samples} confidence intervals contain the population mean."
'95% of confidence should contain the population mean. In this instance, about 86.27% of the 1500 confidence intervals contain the population mean.'
from scipy import stats
stats.norm.interval(confidence = 0.95, loc = 2.591, scale = 3.684842/math.sqrt(30))
(1.2724204546153968, 3.9095795453846036)
Below are two variables of interest which contain a huge amount of outliers, including the dependent variable
Trip duration (in seconds) distribution with outliers
sb.boxplot(data.duration_secs)
<AxesSubplot: >
sb.boxplot(data.total_amount)
<AxesSubplot: >
# remove negative values
cols = data.select_dtypes(include = "number").columns
for col in cols:
data.loc[data[col] < 0, col] = 0
from scipy.stats import zscore
# Ended up not using z scores because they still left many outliers
# In hindsight this would have been quicker if I'd just written a function to do than do this manually, individually
# We're correcting total amount, fare amount, trip distance, and duration in seconds
# NOTE(review): these z-score columns are computed but never read again in the
# visible code — candidates for removal.
data["total_amount_z"] = zscore(data.total_amount)
data["duration_secs_z"] = zscore(data.duration_secs)
# Put a cap on outliers using Tukey's fences: values above Q3 + 1.5*IQR are
# clipped to that bound, negatives are floored at 0. The per-column bound
# computation is factored into a helper instead of four copy-pasted versions.
def _iqr_bounds(series):
    """Return (floor(Q1 - 1.5*IQR), ceil(Q3 + 1.5*IQR)) for a numeric Series."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return math.floor(q1 - 1.5 * iqr), math.ceil(q3 + 1.5 * iqr)

# Total amount outliers
total_amount_lower, total_amount_upper = _iqr_bounds(data["total_amount"])
data.loc[data.total_amount < 0, "total_amount"] = 0
data.loc[data.total_amount > total_amount_upper, "total_amount"] = total_amount_upper
# Trip distance outliers
trip_distance_lower, trip_distance_upper = _iqr_bounds(data["trip_distance"])
data.loc[data.trip_distance < 0, "trip_distance"] = 0
data.loc[data.trip_distance > trip_distance_upper, "trip_distance"] = trip_distance_upper
# Fare amount outliers
# BUG FIX: the original wrote the fare_amount cap into the *total_amount*
# column ("data.loc[data.fare_amount > fare_amount_upper, 'total_amount'] = ..."),
# leaving fare_amount outliers uncapped and corrupting total_amount.
fare_amount_lower, fare_amount_upper = _iqr_bounds(data["fare_amount"])
data.loc[data.fare_amount < 0, "fare_amount"] = 0
data.loc[data.fare_amount > fare_amount_upper, "fare_amount"] = fare_amount_upper
# Duration outliers
duration_secs_lower, duration_secs_upper = _iqr_bounds(data["duration_secs"])
data.loc[data.duration_secs < 0, "duration_secs"] = 0
data.loc[data.duration_secs > duration_secs_upper, "duration_secs"] = duration_secs_upper
sb.boxplot(data.duration_secs)
<AxesSubplot: >
sb.boxplot(data.total_amount)
<AxesSubplot: >
Do the customers who use a credit card pay higher fare amounts than those who use cash?
That said, the TLC team is asking us to consider the following:
EDA
data.fare_amount.describe()
count 22699.000000 mean 13.026629 std 13.243791 min -120.000000 25% 6.500000 50% 9.500000 75% 14.500000 max 999.990000 Name: fare_amount, dtype: float64
data.payment_type.value_counts()
1 15265 2 7267 3 121 4 46 Name: payment_type, dtype: int64
sb.boxplot(data = data, y = "fare_amount", x = "payment_cats", showfliers = False)
<AxesSubplot: xlabel='payment_cats', ylabel='fare_amount'>
data[["payment_cats", "fare_amount"]].groupby('payment_cats').describe()
| fare_amount | ||||||||
|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | |
| payment_cats | ||||||||
| Cash | 7267.0 | 12.213546 | 11.689940 | 0.0 | 6.0 | 9.0 | 14.000 | 450.00 |
| Credit card | 15265.0 | 13.429748 | 13.848964 | 0.0 | 7.0 | 9.5 | 15.000 | 999.99 |
| Dispute | 46.0 | 9.913043 | 24.162943 | -120.0 | 5.0 | 8.5 | 17.625 | 52.00 |
| No charge | 121.0 | 12.186116 | 14.894232 | -4.5 | 2.5 | 7.0 | 15.000 | 65.50 |
sb.scatterplot(x = data.trip_distance, y = data.duration_secs, hue = data.payment_cats)
<AxesSubplot: xlabel='trip_distance', ylabel='duration_secs'>
#fig = plt.figure()
#ax = fig.add_subplot(projection ='3d')
fig = px.scatter_3d(data, x = 'trip_distance', y = 'duration_secs', z = 'total_amount', color = data.payment_cats)
fig.show()
Testing
Student's t-test for equality of means
H0: There is no difference in the amounts paid by credit card and cash users
H1: Customers who use a credit card pay higher fare amounts
Significance level: 5%
# Equal-sized bootstrap samples (n=1000, with replacement, fixed seed for
# reproducibility) for the cash vs credit-card two-sample t-test below.
cash = data[data.payment_cats == 'Cash'].sample(1000, replace = True, random_state = 28)
credit = data[data.payment_cats == 'Credit card'].sample(1000, replace = True, random_state = 28)
# Full-population group means, for comparison with the bootstrap-sample means.
print(f"cash: {data[data.payment_cats == 'Cash'].fare_amount.mean()}\nCredit: {data[data.payment_cats == 'Credit card'].fare_amount.mean()}")
cash: 12.21354616760699 Credit: 13.429747789059942
print(f"cash: {cash.fare_amount.mean()}\nCredit: {credit.fare_amount.mean()}")
cash: 12.368 Credit: 12.9345
stats.ttest_ind(cash.fare_amount, credit.fare_amount, equal_var = False)
Ttest_indResult(statistic=-1.242182632543156, pvalue=0.2143152228089723)
f'The p-value {stats.ttest_ind(cash.fare_amount, credit.fare_amount, equal_var = False).pvalue:.2%}'
'The p-value 21.43%'
Conclusion: The p-value is greater than the significance level, hence we fail to reject the null hypothesis.
The probability that any observed difference in the fare amounts paid by credit card and cash users arose purely by chance is fairly high; there is no statistical evidence to suggest a difference in the fares paid by the two groups.
There is no need to target marketing efforts towards users of any particular payment type.
H0: There is no difference in the amounts paid using any of the payment types
H1: At least one of the payment types has fare amounts different from the others
# One-way ANOVA: does mean fare_amount differ across the four payment types?
# Type-2 sums of squares; C() treats the integer code as categorical.
model_anova = ols(data = data, formula = "fare_amount ~ C(payment_type)").fit()
sm.stats.anova_lm(model_anova, typ = 2)
| sum_sq | df | F | PR(>F) | |
|---|---|---|---|---|
| C(payment_type) | 7.816301e+03 | 3.0 | 14.881664 | 1.124138e-09 |
| Residual | 3.973367e+06 | 22695.0 | NaN | NaN |
The p-value is far below the 5% significance level — the probability of seeing an F-statistic as extreme as or more extreme than 14.881664 is effectively zero — so we may reject the null hypothesis and conclude that the average fare amount is not the same across the available payment methods.
This is to ascertain which pairs of payment types have means that are not the same.
H0: Mean fares paid by the customers is the same for each pair of payment type
H1: The mean amount paid by the customer is different for at least one pair of payment types
# Post-hoc Tukey HSD: which specific pairs of payment types differ in mean
# fare, at a family-wise significance level of 5%.
tukey = pairwise_tukeyhsd(endog = data.fare_amount, groups = data.payment_cats, alpha = 0.05)
tukey.summary()
| group1 | group2 | meandiff | p-adj | lower | upper | reject |
|---|---|---|---|---|---|---|
| Cash | Credit card | 1.2162 | 0.0 | 0.629 | 1.8034 | True |
| Cash | Dispute | -2.3005 | 0.6424 | -8.394 | 3.793 | False |
| Cash | No charge | -0.0274 | 1.0 | -3.8038 | 3.7489 | False |
| Credit card | Dispute | -3.5167 | 0.2734 | -9.6002 | 2.5668 | False |
| Credit card | No charge | -1.2436 | 0.7319 | -5.0037 | 2.5165 | False |
| Dispute | No charge | 2.2731 | 0.7541 | -4.8631 | 9.4092 | False |
Tukey's test says there is no difference in the pairwise means of any of the payment types except for cash and credit cards, which is in stark contrast to the results of the Student's t-test. The difference in the outcomes of the tests is solely due to the use of a smaller, random sub-sample for the t-test versus the entire dataset for the ANOVA and Tukey tests. Using a much larger sub-sample for the t-test confirms the results of the latter two tests.
stats.ttest_ind(data[data.payment_cats == "Cash"].fare_amount.sample(7000, replace = True, random_state = 28), data[data.payment_cats == "Credit card"].fare_amount.sample(7000, replace = True, random_state = 28), equal_var = False)
Ttest_indResult(statistic=-4.485339416872243, pvalue=7.34308239466814e-06)
data.columns
Index(['Unnamed: 0', 'VendorID', 'tpep_pickup_datetime',
'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
'tolls_amount', 'improvement_surcharge', 'total_amount', 'payment_cats',
'duration_secs', 'duration_mins', 'week', 'day', 'month',
'tpep_pickup_time', 'month_num', 'day_num', 'hour', 'period_of_day',
'time_of_day', 'time', 'minutes'],
dtype='object')
# One-way ANOVA: does mean trip duration differ by pickup hour of day?
model_hour = ols(data = data, formula = "duration_secs ~ C(hour)").fit()
sm.stats.anova_lm(model_hour, typ = 2)
| sum_sq | df | F | PR(>F) | |
|---|---|---|---|---|
| C(hour) | 2.714466e+08 | 23.0 | 0.852816 | 0.664989 |
| Residual | 3.137969e+11 | 22675.0 | NaN | NaN |
Goodness of fit:
H0: Trip durations are the same for each month, day, and time on average.
H1: Trip durations are not the same across these periods
# Duration of trips are the same for each month.
# Calculating the expected monthly durations
# Observed mean trip duration per month next to the expected value under H0
# (every month shares the overall mean duration).
month_gof = (
    data[["month", "duration_secs"]]
    .groupby("month")
    .mean()
    .reset_index()
    .copy()
)
month_gof["duration_exp"] = data.duration_secs.mean()
month_gof.style.background_gradient(cmap="Reds")
| month | duration_secs | duration_exp | |
|---|---|---|---|
| 0 | Jan | 825.831247 | 1020.826600 |
| 1 | Feb | 904.775014 | 1020.826600 |
| 2 | Mar | 983.993655 | 1020.826600 |
| 3 | Apr | 1128.545319 | 1020.826600 |
| 4 | May | 1037.839046 | 1020.826600 |
| 5 | Jun | 1157.679735 | 1020.826600 |
| 6 | Jul | 1081.131408 | 1020.826600 |
| 7 | Aug | 897.383991 | 1020.826600 |
| 8 | Sep | 1013.831027 | 1020.826600 |
| 9 | Oct | 922.344351 | 1020.826600 |
| 10 | Nov | 998.272382 | 1020.826600 |
| 11 | Dec | 1296.436393 | 1020.826600 |
# Chi-square goodness-of-fit: observed monthly mean durations vs the uniform
# expectation computed above.
month_chi = stats.chisquare(month_gof.duration_secs, month_gof.duration_exp, axis=0)
# BUG FIX: `pvalue` is a float attribute of the chisquare result, not a
# method — the original `month_chi.pvalue()` raised
# "TypeError: 'numpy.float64' object is not callable" (the "scipy is being
# childish" error the author gave up on).
print(f"Chi-square statistic: {month_chi.statistic:.4f}, p-value: {month_chi.pvalue:.4f}")
data[["day", "duration_secs"]].groupby("day").mean().reset_index().style.background_gradient(cmap="Reds")
| day | duration_secs | |
|---|---|---|
| 0 | Mon | 925.504606 |
| 1 | Tue | 916.036898 |
| 2 | Wed | 1090.023599 |
| 3 | Thu | 1113.103762 |
| 4 | Fri | 1072.891591 |
| 5 | Sat | 975.732997 |
| 6 | Sun | 1034.213476 |
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
# Create a copy of original dataset, split X and Y variables, split into
# training and testing sets, then standardize the predictors.
# We're skipping validation and cross-validation for this model
df = data.copy()
Y = df[["duration_secs"]]
# The index-like "Unnamed: 0" column is simply excluded here, instead of the
# original include-then-filter dance.
X = df[['trip_distance','RatecodeID','payment_type', 'extra', 'mta_tax', 'tip_amount','tolls_amount', 'improvement_surcharge']]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 28)
# BUG FIX (data leakage): the scaler was previously fit on the FULL dataset
# before the train/test split, so test-set statistics leaked into the
# training features. Fit on the training partition only, then apply the same
# transform to the test partition.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X.columns)
X_train
| trip_distance | RatecodeID | payment_type | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | |
|---|---|---|---|---|---|---|---|---|
| 0 | -0.647159 | -0.139036 | -0.677904 | 0.360332 | 0.064119 | -0.299011 | -0.223514 | 0.028072 |
| 1 | -0.896884 | -0.139036 | 1.339082 | -0.719283 | 0.064119 | -0.656009 | -0.223514 | 0.028072 |
| 2 | 0.126990 | -0.139036 | -0.677904 | -0.719283 | 0.064119 | 0.754133 | -0.223514 | 0.028072 |
| 3 | 0.052072 | -0.139036 | -0.677904 | 0.360332 | 0.064119 | 0.432835 | -0.223514 | 0.028072 |
| 4 | 2.299600 | -0.139036 | -0.677904 | 0.360332 | 0.064119 | 1.739447 | 3.890853 | 0.028072 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18132 | -0.986786 | -0.139036 | -0.677904 | 0.360332 | 0.064119 | -0.141932 | -0.223514 | 0.028072 |
| 18133 | 1.850095 | -0.139036 | -0.677904 | 1.439947 | 0.064119 | 1.039731 | -0.223514 | 0.028072 |
| 18134 | -0.966808 | -0.139036 | 1.339082 | -0.719283 | 0.064119 | -0.656009 | -0.223514 | 0.028072 |
| 18135 | -0.831956 | -0.139036 | 1.339082 | -0.719283 | 0.064119 | -0.656009 | -0.223514 | 0.028072 |
| 18136 | -0.727071 | -0.139036 | 1.339082 | -0.719283 | 0.064119 | -0.656009 | -0.223514 | 0.028072 |
18137 rows × 8 columns
# Build and fit model to the training data
# Ordinary least-squares linear regression predicting duration_secs from the
# standardized trip features prepared above.
model_lr = LinearRegression()
model_lr.fit(X_train,Y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# Training-set fit metrics: R^2 (two equivalent routes), MAE, RMSE.
r_sq = model_lr.score(X_train, Y_train)
print(f"Coefficient of determination: {r_sq:.2%}")
Y_pred = model_lr.predict(X_train)
print(f"R^2: {r2_score(Y_train, Y_pred):.2%}")
print(f"MAE: {mean_absolute_error(Y_train,Y_pred)}")
print(f"RMSE:{np.sqrt(mean_squared_error(Y_train, Y_pred))}")
# NOTE(review): the recorded train MAE (~0.40) vs test MAE (~239 seconds)
# differ by orders of magnitude — this looks like stale notebook state
# (e.g. a scaled Y in an earlier run); rerun end-to-end to confirm.
Coefficient of determination: 51.19% R^2: 51.19% MAE: 0.39724078250029016 RMSE:0.5595480873756041
# Held-out test-set metrics for the fitted linear model.
r_sq_test = model_lr.score(X_test, Y_test)
print("Coefficient of determination:", r_sq_test)
Y_pred_test = model_lr.predict(X_test)
print("R^2:", r2_score(Y_test, Y_pred_test))
print("MAE:", mean_absolute_error(Y_test,Y_pred_test))
print("RMSE:",np.sqrt(mean_squared_error(Y_test, Y_pred_test)))
Coefficient of determination: 0.6577045130990453 R^2: 0.6577045130990453 MAE: 239.21455013245085 RMSE: 323.27013412163654
# Dead code: an abandoned attempt to refit the model with statsmodels OLS and
# elastic-net regularization, kept here as a (never-executed) string literal.
# Candidate for deletion.
'''import statsmodels
formula = "data.duration_secs ~ 'data.trip_distance'+ 'data.RatecodeID'+'data.payment_type'+'data.extra'+'data.mta_tax'+ 'data.tip_amount'+'data.tolls_amount'+ 'data.improvement_surcharge'"
model_ols = ols(data = data, formula = formula)
model_ols.fit().summary()
model_res = model_ols.fit_regularized(method = "elastic_net")
pinv_wexog, _ = pinv_extended(model_ols.wexog)
normalized_cov_params = np.dot(pinv_wexog, np.transpose(pinv_wexog))
summary = statsmodels.regression.linear_model.OLSResults(model_ols, model_res.params, normalized_cov_params)
summary.summary()'''
# Collect actual vs predicted trip durations on the held-out test set,
# plus the per-row residual, for the diagnostic plots below.
results = pd.DataFrame(
    {
        "actual": Y_test["duration_secs"],
        "predicted": Y_pred_test.ravel(),
    }
)
results["residual"] = results["actual"] - results["predicted"]
results.head()
| actual | predicted | residual | |
|---|---|---|---|
| 146 | 520.0 | 598.243338 | -78.243338 |
| 19350 | 283.0 | 431.842645 | -148.842645 |
| 20011 | 515.0 | 550.199015 | -35.199015 |
| 13009 | 1620.0 | 1470.542291 | 149.457709 |
| 3822 | 674.0 | 506.622903 | 167.377097 |
sb.set(style='whitegrid')
plt.figure(figsize = (12,12))
sb.regplot(x="actual", y="predicted", data = results)
#plt.show()
<AxesSubplot: xlabel='actual', ylabel='predicted'>
plt.hist(results["residual"], bins=30)
plt.title("Distribution of the residuals")
plt.xlabel("residual value")
plt.ylabel("count")
# We're happy our errors are normally distributed
Text(0, 0.5, 'count')
sb.scatterplot(x = "predicted", y = "residual", data = results)
plt.axhline(0)
plt.title("Scatterplot of residuals over predicted values")
plt.xlabel("predicted value")
plt.ylabel("residual value")
# residuals are a cloud of nonsense, yay!!
import explainerdashboard
from explainerdashboard import RegressionExplainer, ClassifierExplainer, ExplainerDashboard

# Human-readable feature descriptions shown in the dashboard's tables.
feature_descriptions = {
    # BUG FIX: trip_distance was described as the "Duration of the trip in
    # miles" — it is a distance, not a duration.
    'trip_distance': "Distance of the trip in miles",
    # Also fixes the "tarrif" typo in the original description.
    'RatecodeID': "Category of the tariff being used and charged",
    'payment_type': "The payment method used by the passenger",
    'extra': "Extras",
    'mta_tax': "Tax",
    'tip_amount': "Tip paid by passenger",
    'tolls_amount': "Amount paid at toll",
    # Previously an empty string.
    'improvement_surcharge': "Improvement surcharge fee",
}
# Build an interactive SHAP-based dashboard over the test partition;
# units="s" labels predictions as seconds.
explainer = RegressionExplainer(model_lr, X_test, Y_test, units = "s",
                                descriptions = feature_descriptions)
dashboard = ExplainerDashboard(explainer, title = "Taxi Trip Duration Estimator")
dashboard.run()
WARNING: For shap='linear', shap interaction values can unfortunately not be calculated!
Warning: shap values for shap.LinearExplainer get calculated against X_background, but paramater X_background=None, so using X instead
Generating self.shap_explainer = shap.LinearExplainer(modelX)...
Building ExplainerDashboard..
WARNING: the number of idxs (=4540) > max_idxs_in_dropdown(=1000). However with your installed version of dash(2.9.3) dropdown search may not work smoothly. You can downgrade to `pip install dash==2.6.2` which should work better for now...
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating shap values...
Calculating predictions...
Calculating residuals...
Calculating absolute residuals...
Warning: mean-absolute-percentage-error is very large (2101690679724542.8), you can hide it from the metrics by passing parameter show_metrics...
Warning: mean-absolute-percentage-error is very large (2101690679724542.8), you can hide it from the metrics by passing parameter show_metrics...
Warning: mean-absolute-percentage-error is very large (2101690679724542.8), you can hide it from the metrics by passing parameter show_metrics...
Calculating dependencies...
Calculating importances...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard on http://10.0.0.107:8050
Dash is running on http://0.0.0.0:8050/
* Serving Flask app 'explainerdashboard.dashboards'
* Debug mode: off
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
* Running on all addresses (0.0.0.0)
* Running on http://127.0.0.1:8050
* Running on http://10.0.0.107:8050
Press CTRL+C to quit
10.0.0.107 - - [13/May/2023 20:47:30] code 400, message Bad request version ('\x00\x02\x01\x00\x00+\x00\x07\x06ºº\x03\x04\x03\x03\x00\x12\x00\x00\x00')
äëÙëÊõù÷Eå_Di h2 + ºº ü5¯j[K´ô"òàML
Ú÷íÕµ~'T±µpY ÇqCaR¢ê|Î<èØåG¯%`ez(ºÉ"